# Link to Git repo: https://github.com/nktang05/kibera.git
# Link to Analysis: https://nktang05.github.io/kibera/KiberaAnalysis.html
# Sanity print; it has no effect on the data.
print("hello world")
## [1] "hello world"
# Read in the raw data.
# The working directory is set first: the output files written at the end of
# this script use relative paths, so they are created inside this folder.
setwd("~/Desktop/GRIT/Kibera")
# header = TRUE: the first line of the CSV supplies the column names. The
# first two rows read after it are non-data rows and are removed further down.
data <- fread("~/Desktop/GRIT/Kibera/kibera_values_data.csv", header = TRUE)
# Get rid of unnecessary data columns: the export's bookkeeping/identifier
# fields and a set of "_TEXT" columns that are not used in the analysis.
dropVars <- c(
  "StartDate", "EndDate", "Status", "IPAddress", "Progress",
  "Duration (in seconds)", "Finished", "RecordedDate", "ResponseId",
  "RecipientLastName", "RecipientFirstName", "RecipientEmail",
  "ExternalReference", "LocationLatitude", "LocationLongitude",
  "DistributionChannel", "UserLanguage",
  "2.11_7_TEXT", "2.13_7_TEXT", "2.20_5_TEXT", "3.2_8_TEXT",
  "3.16_6_TEXT", "4.21_5_TEXT", "5.1_5_TEXT", "5.12_6_TEXT"
)
for (col in dropVars) {
  # Assigning NULL removes the column from the table.
  data[[col]] <- NULL
}
# Set aside the variable labels: the first row of the table, flattened to a
# character vector with one entry per remaining column.
variable_labels <- as.character(unlist(data[1, ]))
# Drop the first two rows, which are non-data rows.
data <- data[-(1:2), ]
# Prefix every column name that starts with a digit with "x"
# (e.g. "2.1" becomes "x2.1"); all other names are left untouched.
names(data) <- vapply(
  names(data),
  function(nm) if (grepl("^[0-9]", nm)) paste0("x", nm) else nm,
  character(1),
  USE.NAMES = FALSE
)
# Numeric variables: convert from text to numbers.
numericVars <- c(
  "x1.1", "x1.2", "x1.3",
  "x2.1",
  "x3.1_1_TEXT", "x3.9"
)
for (col in numericVars) {
  # Entries that cannot be read as a number become NA (R warns when it happens).
  data[[col]] <- as.numeric(as.character(data[[col]]))
}
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
# Date variable: x1.4 is parsed as day/month/year.
data[["x1.4"]] <- as.Date(data[["x1.4"]], format = "%d/%m/%Y")
# String variables: make sure the free-text columns are character vectors.
charVars <- c(
  "x2.5_5_TEXT", "x2.7_6_TEXT", "x2.10_8_TEXT", "x2.12_10_TEXT",
  "x2.14_5_TEXT", "x2.16_7_TEXT", "x2.21_9_TEXT", "x3.3_7_TEXT",
  "x3.13_8_TEXT", "x3.14_7_TEXT", "x3.17_7_TEXT", "x4.10_1_TEXT",
  "x7.9_6_TEXT"
)
for (col in charVars) {
  data[[col]] <- as.character(data[[col]])
}
# All remaining columns are converted to factors later in the script, after
# the codebook and the labels have been created.
# Handy interactive checks at this point: names(data), summary(data)
# Keep only respondents aged 13 to 20 (inclusive) whose gender is recorded.
# Rows are dropped when age (x2.1) is missing or out of range, or when gender
# (x2.2) is NA or an empty string.
# x2.1 was converted to numeric earlier in the script, so a missing age is NA
# (never an empty string) and the NA test alone covers it.
data <- data[
  !is.na(data[["x2.1"]]) &
    data[["x2.1"]] >= 13 &
    data[["x2.1"]] <= 20 &
    !is.na(data[["x2.2"]]) &
    data[["x2.2"]] != "",
]
# CHECK FOR MALE CONDITIONALS
# Gender (x2.2) code 2 is Male. Answers a male respondent should not have
# given are blanked out (set to NA). NA entries in a mask are ignored by the
# assignment, so rows with missing values are left as they are.
# Pregnancy (x4.8): blank a "Yes" (code 1) given by a male respondent.
data[["x4.8"]][data[["x2.2"]] == 2 & data[["x4.8"]] == 1] <- NA
# Menstruation (x3.1): blank codes 1 and 2 given by a male respondent.
data[["x3.1"]][data[["x2.2"]] == 2 & data[["x3.1"]] %in% c(1, 2)] <- NA
# Menstruation follow-ups (age in x3.1_1_TEXT, then x3.2 to x3.7): blank any
# non-empty answer given by a male respondent.
for (col in c("x3.1_1_TEXT", "x3.2", "x3.3", "x3.4", "x3.5", "x3.6", "x3.7")) {
  data[[col]][data[["x2.2"]] == 2 & data[[col]] != ""] <- NA
}
# CHECK FOR SEX ACTIVITY CONDITIONALS
# Blank x3.9 when x3.8 is not 1 but x3.9 was still filled in.
# The mask is keyed on x3.8, the same condition the verification query further
# down uses (WHERE [x3.8] != 1 AND [x3.9] != ''). It was previously keyed on
# gender (x2.2 != 1), which wiped x3.9 for every male respondent.
# NOTE(review): presumably x3.9 is only asked when x3.8 == 1 - confirm against
# the questionnaire.
data[["x3.9"]][data[["x3.8"]] != 1 & data[["x3.9"]] != ""] <- NA
# Verification queries. Each sqldf() call runs SQL against the `data` table
# and prints the result; the "##" lines record the output that was observed.
# Query: x1.2 values (village numbers) that occur more than once, with counts.
sqldf("SELECT [x1.2], COUNT(*) as count
FROM data
GROUP BY [x1.2]
HAVING COUNT(*) > 1")
## x1.2 count
## 1 NA 18
## 2 1 2
## 3 17 2
## 4 30 3
## 5 202 2
## 6 205 2
## 7 207 2
## 8 208 2
## 9 209 2
## 10 210 2
## 11 211 2
## 12 265 2
## 13 270 2
## 14 271 2
## 15 436 2
## 16 444 2
## 17 451 2
## 18 456 2
## 19 543 3
## 20 607 2
# Check: male respondents (x2.2 = 2) still recorded as pregnant (x4.8 = 1).
# Zero rows means the cleaning step above worked.
sqldf("SELECT [x2.2], [x4.8] , [x1.2]
FROM data
WHERE [x2.2] = 2 AND [x4.8] = 1")
## [1] x2.2 x4.8 x1.2
## <0 rows> (or 0-length row.names)
# Check: male respondents with a menstruation answer (x3.1 = 1 or 2).
sqldf("SELECT [x2.2], [x3.1]
FROM data
WHERE [x2.2] = 2 AND ([x3.1] = 1 OR [x3.1] = 2)")
## [1] x2.2 x3.1
## <0 rows> (or 0-length row.names)
# Check: male respondents with a non-empty menstruation age (x3.1_1_TEXT).
sqldf("SELECT [x2.2], [x3.1_1_TEXT]
FROM data
WHERE [x2.2] = 2 AND [x3.1_1_TEXT] != ''")
## [1] x2.2 x3.1_1_TEXT
## <0 rows> (or 0-length row.names)
# Check: male respondents with a non-empty x3.2.
sqldf("SELECT [x2.2], [x3.2]
FROM data
WHERE [x2.2] = 2 AND [x3.2] != ''")
## [1] x2.2 x3.2
## <0 rows> (or 0-length row.names)
# Same check for the x3.3, x3.4, x3.5, x3.6, x3.7 group; only x3.7 is
# actually queried here.
sqldf("SELECT [x2.2], [x3.7]
FROM data
WHERE [x2.2] = 2 AND [x3.7] != ''")
## [1] x2.2 x3.7
## <0 rows> (or 0-length row.names)
# Check for the sex conditionals: rows where x3.8 is not 1 but x3.9 is
# non-empty.
sqldf("SELECT [x3.8], [x3.9]
FROM data
WHERE [x3.8] != 1 AND [x3.9] != ''")
## [1] x3.8 x3.9
## <0 rows> (or 0-length row.names)
# Make the variable codebook from the cleaned data.
# NOTE(review): this runs before the variable and value labels below are
# attached, so the codebook is built from unlabelled columns - confirm that
# this ordering is intended.
codebook_output <- codebook(data)
# Attach the variable labels that were set aside earlier (variable_labels,
# one entry per column, in column order) to the matching columns.
for (i in seq_along(data)) {
  var_label(data[[i]]) <- variable_labels[i]
}
# Test for success interactively with: var_label(data)
# THIS IS WHERE MORE LABELS NEED TO BE HARDCODED IN
# Gender (x2.2): attach value labels to the integer codes.
# as.integer() strips attributes, so the variable label attached by the loop
# above is passed back in through `label`; otherwise x2.2 would be the only
# column left without its variable label.
data$x2.2 <- labelled(
  x = as.integer(data$x2.2),
  labels = c("Female" = 1, "Male" = 2),
  label = var_label(data$x2.2)
)
# Make factor variables, now that the codebook and labels exist: every column
# that is not numeric, free text, or the date column (x1.4) becomes a factor.
allVars <- names(data)
excludeVars <- c(numericVars, charVars, "x1.4")
factorVars <- unique(allVars[!(allVars %in% excludeVars)])
for (col in factorVars) {
  # levels = "labels": use the value labels, where a column has them, as the
  # factor levels.
  data[[col]] <- to_factor(data[[col]], levels = "labels")
}
# Interactive checks: class(data$x2.2), levels(data$x2.2), summary(data),
# summary(data$x2.2)
# Write a new CSV of the clean data (relative path: goes to the working
# directory).
fwrite(x = data, file = "kibera_values_cleaned.csv")
# Write the codebook file.
# NOTE(review): the object saved here is `data` (the cleaned, labelled
# table), not `codebook_output`, which is never written anywhere in this
# script - confirm which one is meant.
saveRDS(object = data, file = "codebook.rds")